{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a157ed64-7369-4d96-9296-c3c94e217b08",
   "metadata": {},
   "source": [
    "### Process and check output from CADD-SV \n",
    "\n",
    "* Reformat header of output bed files \n",
    "* Check that output variants match input variants from SV info file (inversions removed)\n",
    "* Merge with information in SV info file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2b94093a-85f2-4d95-9939-f4dcaeed3989",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import pandas as pd\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b4f8ff79",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "sv_info_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt\"\n",
    "cadd_sv_dir = wkdir_path.joinpath(\"5_misexp_vrnts/scores/cadd_sv\")\n",
    "cadd_sv_output_path = cadd_sv_dir.joinpath(\"output\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2de8bd16",
   "metadata": {},
   "outputs": [],
   "source": [
    "processed_output_dir = cadd_sv_dir.joinpath(\"processed\")\n",
    "processed_output_dir.mkdir(parents=True, exist_ok=True)\n",
    "scores_dir = cadd_sv_dir.joinpath(\"scores\")\n",
    "scores_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e9c98f57-d391-478e-85f7-8b0d9c6fc21a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# clean all CADD-SV output files\n",
    "output_bed_files = cadd_sv_output_path.glob(\"intrvl_svs_no_inv_*_score.bed\")\n",
    "for output_bed in output_bed_files: \n",
    "    bed_file = output_bed.name\n",
    "    bed_file_header = f\"{bed_file.split('.')[0]}.clean_header.{bed_file.split('.')[1]}\"\n",
    "    output_bed_clean_header = processed_output_dir.joinpath(bed_file_header)\n",
    "    with open(output_bed, \"r\") as f_in, open(output_bed_clean_header, \"w\") as f_out:\n",
    "        for line in f_in:\n",
    "            if line.startswith(\"##\"):\n",
    "                continue\n",
    "            if line.startswith('chr'): \n",
    "                header_list = line.split(\" \") \n",
    "                # add missing column \n",
    "                header_list.insert(6, \"Raw-Score-combined\")\n",
    "                updated_header = \"\\t\".join(header_list) + \"\\n\"\n",
    "                f_out.write(updated_header)\n",
    "            else:\n",
    "                f_out.write(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3428b834-1349-4656-bb6b-bb9365046608",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of variants with CADD-SV scores: 121042\n"
     ]
    }
   ],
   "source": [
    "# combine all processed output files \n",
    "clean_bed_list = []\n",
    "output_bed_files_clean = processed_output_dir.glob(\"intrvl_svs_no_inv_*_score.clean_header.bed\")\n",
    "for clean_bed in output_bed_files_clean:\n",
    "    clean_bed_list.append(pd.read_csv(clean_bed, sep=\"\\t\"))\n",
    "intrvl_cadd_sv_scores_df = pd.concat(clean_bed_list)\n",
    "# add variant ID \n",
    "intrvl_cadd_sv_scores_df[\"variant_id\"] = \"chr\" + intrvl_cadd_sv_scores_df.chr.astype(str) + \":\" + intrvl_cadd_sv_scores_df.start.astype(str) + \":\" + intrvl_cadd_sv_scores_df.end.astype(str) + \":\" + intrvl_cadd_sv_scores_df.type\n",
    "cadd_sv_vrnt_ids = set(intrvl_cadd_sv_scores_df.variant_id.unique())\n",
    "print(f\"Number of variants with CADD-SV scores: {len(cadd_sv_vrnt_ids)}\")\n",
    "\n",
    "# check all variants are included \n",
    "sv_info_df = pd.read_csv(sv_info_path, sep=\"\\t\", dtype={\"plinkID\": str}).rename(columns={\"plinkID\":\"vrnt_id\"})\n",
    "sv_info_no_inv_df = sv_info_df[sv_info_df.SVTYPE != \"INV\"].copy()\n",
    "sv_info_no_inv_df[\"SVTYPE\"] = sv_info_no_inv_df[\"SVTYPE\"].replace(\"MEI\", \"INS\")\n",
    "sv_info_no_inv_df[\"variant_id\"] = sv_info_no_inv_df.chr + \":\" + sv_info_no_inv_df.pos.astype(str) + \":\" + sv_info_no_inv_df.end.astype(str) + \":\" + sv_info_no_inv_df.SVTYPE \n",
    "sv_info_vrnt_ids = sv_info_no_inv_df.variant_id.tolist()\n",
    "if set(cadd_sv_vrnt_ids) != set(sv_info_vrnt_ids): \n",
    "    raise ValueError(\"CADD-SV output variants are not the same as input\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d70be731-f6f5-4c58-a20d-e3cf6608294a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# add variant information \n",
    "intrvl_cadd_sv_scores_info_df = pd.merge(sv_info_no_inv_df, \n",
    "                                         intrvl_cadd_sv_scores_df.drop(columns=[\"chr\", \"start\", \"end\", \"type\"]),  \n",
    "                                         how=\"inner\", \n",
    "                                         on=\"variant_id\")\n",
    "intrvl_cadd_sv_scores_info_df[\"SVTYPE\"] = intrvl_cadd_sv_scores_info_df[\"SVTYPE\"].replace(\"INS\", \"MEI\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "76e350e9-0eb2-422e-ac8b-86b806f49a6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file \n",
    "cadd_sv_score_info_path = scores_dir.joinpath(\"intrvl_svs_no_inv_121042_cadd_sv_info.tsv\")\n",
    "intrvl_cadd_sv_scores_info_df.to_csv(cadd_sv_score_info_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4181786f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
